import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from PIL import Image
import pandas as pd
import seaborn as sns
import itertools
import warnings
warnings.filterwarnings("ignore")
import io
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff
import plotly
%matplotlib inline
import plotly.io as pio
pio.renderers.default='notebook'
# Load the IBM HR attrition dataset and print a quick structural overview:
# dimensions, column names, total missing values and per-column cardinality.
data = pd.read_csv('C:/Users/HughOOZ/Downloads/WA_Fn-UseC_-HR-Employee-Attrition.csv')
data.head()
n_rows, n_cols = data.shape
print('row:', n_rows)
print('columns:', n_cols)
print('\nfeatures:', data.columns.tolist())
print('\nmissing values:', data.isnull().sum().values.sum())
print('\nunique values:', '\n', data.nunique())
def age(data):
    """Bucket an employee record into a coarse age band.

    data -- a row (mapping-like) exposing data["Age"].
    Returns one of the "Age_*" bucket labels; falls through to None when
    the age is missing/NaN (every comparison is False then, mirroring the
    original if/elif chain).
    """
    value = data["Age"]
    for upper_bound, bucket in ((25, "Age_0-25"),
                                (35, "Age_25-35"),
                                (45, "Age_35-45"),
                                (55, "Age_45-55")):
        if value <= upper_bound:
            return bucket
    if value > 55:
        return "Age_55-60"
# Derive the age bucket for every row (row-wise apply of `age` above).
data["age_group"] = data.apply(lambda x:age(x), axis=1)
# Drop columns that are constant in this dataset and carry no signal.
data = data.drop(['StandardHours','Over18','EmployeeCount'], axis=1)
# Split the frame into leavers and stayers for side-by-side plotting.
attrition = data[data['Attrition'] == 'Yes']
not_attrition = data[data['Attrition'] == 'No']
# Column bookkeeping: identifier, label, categorical (< 10 distinct values)
# and the remaining numerical predictors.
Id_col = ['EmployeeNumber']
label_col = ['Attrition']
cat_cols = data.nunique()[data.nunique() < 10].keys().tolist()
cat_cols = [x for x in cat_cols if x not in label_col]
num_cols = [x for x in data.columns if x not in cat_cols + label_col + Id_col]
print(cat_cols,'\n',num_cols)
# Class labels and counts feeding the attrition pie chart below.
lab = data['Attrition'].value_counts().keys().tolist()
val = data['Attrition'].value_counts().values.tolist()
# Donut chart of the overall Attrition class balance.
trace = go.Pie(labels = lab ,
               values = val ,
               marker = dict(colors = [ 'royalblue' ,'lime'],
                             line = dict(color = "white",
                                         width = 1.3)
                            ),
               rotation = 90,
               hoverinfo = "label+value+text",
               hole = .5
              )
layout = go.Layout(dict(title = "Employee Attrition in data",
                        plot_bgcolor = "rgb(243,243,243)",
                        paper_bgcolor = "rgb(243,243,243)",
                       )
                  )
fig = go.Figure(data = [trace],layout = layout)
py.iplot(fig)
# NOTE(review): the figure above is plotly, not matplotlib -- this
# plt.show() has no figure to display and is effectively a no-op.
plt.show()
def plot_pie(column):
    """Side-by-side donut charts of *column*'s distribution for employees
    who left (left donut) vs stayed (right donut).

    Uses the module-level ``attrition`` / ``not_attrition`` frames.
    """
    # FIX: the attrition donut's labels previously came from
    # not_attrition's value_counts, so its slices were mislabelled
    # whenever the two groups' category orderings differed.
    trace1 = go.Pie(values = attrition[column].value_counts().values.tolist(),
                    labels = attrition[column].value_counts().keys().tolist(),
                    hoverinfo = "label+percent+name",
                    domain = dict(x = [0,.48]),
                    name = "attrition",
                    marker = dict(line = dict(width = 2,
                                              color = "rgb(243,243,243)")
                                 ),
                    hole = .6
                   )
    trace2 = go.Pie(values = not_attrition[column].value_counts().values.tolist(),
                    labels = not_attrition[column].value_counts().keys().tolist(),
                    hoverinfo = "label+percent+name",
                    marker = dict(line = dict(width = 2,
                                              color = "rgb(243,243,243)")
                                 ),
                    domain = dict(x = [.52,1]),
                    hole = .6,
                    name = "not attrition"
                   )
    # Shared title plus one annotation in the hole of each donut.
    layout = go.Layout(dict(title = column + " distribution in attrition ",
                            plot_bgcolor = "rgb(243,243,243)",
                            paper_bgcolor = "rgb(243,243,243)",
                            annotations = [dict(text = "attrition",
                                                font = dict(size = 13),
                                                showarrow = False,
                                                x = .15, y = .5),
                                           dict(text = "not attrition",
                                                font = dict(size = 13),
                                                showarrow = False,
                                                x = .88,y = .5
                                               )
                                          ]
                           )
                      )
    data = [trace1,trace2]
    fig = go.Figure(data = data,layout = layout)
    py.iplot(fig)
def histogram(column):
    """Overlay percent-normalised histograms of *column* for employees
    who left vs stayed (module-level ``attrition`` / ``not_attrition``)."""

    def make_trace(frame, trace_name):
        # One percent-normalised histogram with a thin black bar outline.
        return go.Histogram(x = frame[column],
                            histnorm = "percent",
                            name = trace_name,
                            marker = dict(line = dict(width = .5,
                                                      color = "black"
                                                     )
                                         ),
                            opacity = .9
                           )

    def styled_axis(axis_title):
        # White grid on the grey plot background, shared axis styling.
        return dict(gridcolor = 'rgb(255, 255, 255)',
                    title = axis_title,
                    zerolinewidth = 1,
                    ticklen = 5,
                    gridwidth = 2
                   )

    layout = go.Layout(dict(title = column + " distribution in attrition ",
                            plot_bgcolor = "rgb(243,243,243)",
                            paper_bgcolor = "rgb(243,243,243)",
                            xaxis = styled_axis(column),
                            yaxis = styled_axis("percent"),
                           )
                      )
    traces = [make_trace(attrition, "attrition"),
              make_trace(not_attrition, "Not attrition")]
    fig = go.Figure(data = traces, layout = layout)
    py.iplot(fig)
# Render one donut pair per categorical column and one overlaid
# histogram per numerical column.
for column in cat_cols:
    plot_pie(column)
for column in num_cols:
    histogram(column)
def scatter_matrix(df):
    """Scatter-plot matrix (Splom) of three numeric columns, with markers
    coloured by the Attrition class of each row."""
    df = df.sort_values(by = "Attrition" ,ascending = True)
    classes = df["Attrition"].unique().tolist()
    # FIX: the class->code map was hard-coded to range(2); derive it from
    # the actual number of classes.
    class_code = {classes[k] : k for k in range(len(classes))}
    color_vals = [class_code[cl] for cl in df["Attrition"]]
    pl_colorscale = "Portland"
    # FIX: hover text was built with df.loc[k] over the ORIGINAL RangeIndex
    # labels, so after sort_values it no longer lined up with the plotted
    # rows; take the labels in the sorted row order instead.
    text = df["Attrition"].tolist()
    trace = go.Splom(dimensions = [dict(label = x,values = df[x]) for x in ['MonthlyIncome','MonthlyRate','DistanceFromHome']],
                     text = text,
                     marker = dict(color = color_vals,
                                   colorscale = pl_colorscale,
                                   size = 3,
                                   showscale = False,
                                   line = dict(width = .1,
                                               color='rgb(230,230,230)'
                                              )
                                  )
                    )
    # One shared axis style applied to all 3x3 panel axes.
    axis = dict(showline = True,
                zeroline = False,
                gridcolor = "#fff",
                ticklen = 4
               )
    layout = go.Layout(dict(title =
                            "Scatter plot matrix for Numerical columns for attrition",
                            autosize = False,
                            height = 800,
                            width = 800,
                            dragmode = "select",
                            hovermode = "closest",
                            plot_bgcolor = 'rgba(240,240,240, 0.95)',
                            xaxis1 = dict(axis),
                            yaxis1 = dict(axis),
                            xaxis2 = dict(axis),
                            yaxis2 = dict(axis),
                            xaxis3 = dict(axis),
                            yaxis3 = dict(axis)
                           )
                      )
    plt_data = [trace]
    fig = go.Figure(data = plt_data,layout = layout )
    py.iplot(fig)
# Draw the scatter-plot matrix for the full dataset.
scatter_matrix(data)
# Count employees per age bucket, separately for leavers and stayers.
age_at = attrition['age_group'].value_counts().reset_index()
age_at.columns = ["age_group","count"]
age_nat = not_attrition["age_group"].value_counts().reset_index()
age_nat.columns = ["age_group","count"]
# Grouped bar chart: attrition vs non-attrition counts per age group.
trace1 = go.Bar(x = age_at["age_group"] , y = age_at["count"],
                name = "Age_Attrition",
                marker = dict(line = dict(width = .5,color = "black")),
                opacity = .9)
trace2 = go.Bar(x = age_nat["age_group"] , y = age_nat["count"],
                name = "Age_NotAttrition",
                marker = dict(line = dict(width = .5,color = "black")),
                opacity = .9)
layout = go.Layout(dict(title = " attrition in age groups",
                        plot_bgcolor = "rgb(243,243,243)",
                        paper_bgcolor = "rgb(243,243,243)",
                        xaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                     title = "age group",
                                     zerolinewidth=1,ticklen=5,gridwidth=2),
                        yaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                     title = "count",
                                     zerolinewidth=1,ticklen=5,gridwidth=2),
                       )
                  )
plt_data = [trace1,trace2]
fig = go.Figure(data=plt_data,layout=layout)
py.iplot(fig)
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
# Recompute the column groups on the post-cleaning frame.
Id_col = ['EmployeeNumber']
target_col = ["Attrition"]
cat_cols = data.nunique()[data.nunique() < 10].keys().tolist()
cat_cols = [x for x in cat_cols if x not in target_col]
num_cols = [x for x in data.columns if x not in cat_cols + target_col + Id_col]
# Binary columns (including the target) get label encoding;
# multi-level categoricals get one-hot dummies.
bin_cols = data.nunique()[data.nunique() == 2].keys().tolist()
multi_cols = [i for i in cat_cols if i not in bin_cols]
le = LabelEncoder()
for i in bin_cols :
    data[i] = le.fit_transform(data[i])
data = pd.get_dummies(data = data,columns = multi_cols )
# Standardise the numerical columns (zero mean, unit variance).
std = StandardScaler()
scaled = std.fit_transform(data[num_cols])
scaled = pd.DataFrame(scaled,columns=num_cols)
# Keep an unscaled (but encoded) copy: used later for chi2 selection,
# tree visualisation and the descriptive-statistics table.
df_data_og = data.copy()
# Swap the raw numerical columns for their scaled versions.
# NOTE(review): the merge joins on the index -- relies on `data` still
# carrying its default RangeIndex so rows line up with `scaled`.
data = data.drop(columns = num_cols,axis = 1)
data = data.merge(scaled,left_index=True,right_index=True,how = "left")
# Descriptive statistics (count/mean/std/quartiles) for all non-id columns,
# rendered as a plotly table.
summary = (df_data_og[[i for i in df_data_og.columns if i not in Id_col]].
           describe().transpose().reset_index())
summary = summary.rename(columns = {"index" : "feature"})
summary = np.around(summary,3)
val_lst = [summary['feature'], summary['count'],
           summary['mean'],summary['std'],
           summary['min'], summary['25%'],
           summary['50%'], summary['75%'], summary['max']]
trace = go.Table(header = dict(values = summary.columns.tolist(),
                               line = dict(color = ['#506784']),
                               fill = dict(color = ['#119DFF']),
                              ),
                 cells = dict(values = val_lst,
                              line = dict(color = ['#506784']),
                              fill = dict(color = ["lightgrey",'#F5F8FF'])
                             ),
                 columnwidth = [200,60,100,100,60,60,80,80,80])
layout = go.Layout(dict(title = "Variable Summary"))
figure = go.Figure(data=[trace],layout=layout)
py.iplot(figure)
# Pearson correlation heatmap over every (now fully numeric) column.
correlation = data.corr()
matrix_cols = correlation.columns.tolist()
corr_array = np.array(correlation)
trace = go.Heatmap(z = corr_array,
                   x = matrix_cols,
                   y = matrix_cols,
                   colorscale = "Viridis",
                   colorbar = dict(title = "Pearson Correlation coefficient",
                                   titleside = "right"
                                  ) ,
                  )
layout = go.Layout(dict(title = "Correlation Matrix for variables",
                        autosize = False,
                        height = 720,
                        width = 800,
                        # Wide left/bottom margins leave room for the long
                        # dummy-column tick labels.
                        margin = dict(r = 0 ,l = 210,
                                      t = 25,b = 210,
                                     ),
                        yaxis = dict(tickfont = dict(size = 9)),
                        xaxis = dict(tickfont = dict(size = 9))
                       )
                  )
plt_data = [trace]
fig = go.Figure(data=plt_data,layout=layout)
py.iplot(fig)
from sklearn.decomposition import PCA
# Project the predictors onto their first two principal components
# for a 2-D class-separability view.
pca = PCA(n_components = 2)
X = data[[i for i in data.columns if i not in Id_col + target_col]]
Y = data[target_col + Id_col]
principal_components = pca.fit_transform(X)
pca_data = pd.DataFrame(principal_components,columns = ["PC1","PC2"])
# Re-attach the label and id columns (index join on the shared RangeIndex).
pca_data = pca_data.merge(Y,left_index=True,right_index=True,how="left")
# Map the encoded 0/1 labels back to readable names for the legend.
pca_data["Attrition"] = pca_data["Attrition"].replace({1:"Attrition",0:"Not Attrition"})
pca_data
def pca_scatter(target, color):
    """Return a scatter trace of the two principal components for one class.

    target -- value of pca_data["Attrition"] to plot ("Attrition"/"Not Attrition")
    color  -- marker colour for the trace
    """
    subset = pca_data[pca_data["Attrition"] == target]
    # FIX: the hover text used str(Series), which attached the whole Series
    # repr as a single string to every point; build one label per point
    # with elementwise astype(str) instead.
    tracer = go.Scatter(x = subset["PC1"],
                        y = subset["PC2"],
                        name = target,mode = "markers",
                        marker = dict(color = color,
                                      line = dict(width = .5),
                                      symbol = "diamond-open"),
                        text = ("EmployeeNumber : " +
                                subset['EmployeeNumber'].astype(str))
                       )
    return tracer
# Scatter plot of the two principal components, one trace per class.
layout = go.Layout(dict(title = "Visualising data with principal components",
                        plot_bgcolor = "rgb(243,243,243)",
                        paper_bgcolor = "rgb(243,243,243)",
                        xaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                     title = "principal component 1",
                                     zerolinewidth=1,ticklen=5,gridwidth=2),
                        yaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                     title = "principal component 2",
                                     zerolinewidth=1,ticklen=5,gridwidth=2),
                        height = 600
                       )
                  )
trace1 = pca_scatter('Attrition','red')
trace2 = pca_scatter('Not Attrition','royalblue')
# Attrition drawn last so the minority class sits on top.
plt_data = [trace2,trace1]
fig = go.Figure(data=plt_data,layout=layout)
py.iplot(fig)
# Binary (two-level) columns only -- input for the radar plots below.
bi_cs = data.nunique()[data.nunique() == 2].keys()
dat_rad = data[bi_cs]
def plot_radar(df,aggregate,title) :
    """Radar (Scatterpolar) chart of per-feature yes/no counts over every
    binary column, for one Attrition class.

    df        -- frame restricted to binary columns (``dat_rad``)
    aggregate -- encoded Attrition value to filter on (1 = left, 0 = stayed)
    title     -- figure title
    """
    data_frame = df[df["Attrition"] == aggregate]
    # Per-feature count of 1's; the complement gives the count of 0's.
    data_frame_x = data_frame[bi_cs].sum().reset_index()
    data_frame_x.columns = ["feature","yes"]
    data_frame_x["no"] = data_frame.shape[0] - data_frame_x["yes"]
    # The label itself is constant within this subset -- drop it from the axes.
    data_frame_x = data_frame_x[data_frame_x["feature"] != "Attrition"]
    #count of 1's(yes)
    trace1 = go.Scatterpolar(r = data_frame_x["yes"].values.tolist(),
                             theta = data_frame_x["feature"].tolist(),
                             fill = "toself",name = "count of 1's",
                             mode = "markers+lines",
                             marker = dict(size = 5)
                            )
    #count of 0's(No)
    trace2 = go.Scatterpolar(r = data_frame_x["no"].values.tolist(),
                             theta = data_frame_x["feature"].tolist(),
                             fill = "toself",name = "count of 0's",
                             mode = "markers+lines",
                             marker = dict(size = 5)
                            )
    layout = go.Layout(dict(polar = dict(radialaxis = dict(visible = True,
                                                           side = "counterclockwise",
                                                           showline = True,
                                                           linewidth = 2,
                                                           tickwidth = 2,
                                                           gridcolor = "white",
                                                           gridwidth = 2),
                                         angularaxis = dict(tickfont = dict(size = 10),
                                                            layer = "below traces"
                                                           ),
                                         bgcolor = "rgb(243,243,243)",
                                        ),
                            paper_bgcolor = "rgb(243,243,243)",
                            title = title,height = 700))
    plt_data = [trace2,trace1]
    fig = go.Figure(data=plt_data,layout=layout)
    py.iplot(fig)
#plot
# Radar chart for each class (1 = attrition, 0 = retained).
plot_radar(dat_rad,1,"Attrition")
plot_radar(dat_rad,0,"Not Attrition")
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
# FIX: `scorer` was imported but never used anywhere in this file, and the
# name was removed from sklearn.metrics (deprecated, gone as of 0.24) --
# dropping it keeps this import valid on modern scikit-learn.
from sklearn.metrics import roc_auc_score,roc_curve
from sklearn.metrics import f1_score
import statsmodels.api as sm
from sklearn.metrics import precision_score,recall_score
from yellowbrick.classifier import DiscriminationThreshold
# Hold out 25% of rows for testing.
# NOTE(review): random_state=None makes this split (and every metric
# downstream) non-reproducible between runs -- consider pinning a seed.
train,test = train_test_split(data,test_size = .25 ,random_state = None)
cols = [i for i in data.columns if i not in Id_col + target_col]
train_X = train[cols]
train_Y = train[target_col]
test_X = test[cols]
test_Y = test[target_col]
#Function attributes
#dataframe - processed dataframe
#Algorithm - Algorithm used
#training_x - predictor variables dataframe(training)
#testing_x - predictor variables dataframe(testing)
#training_y - target variable(training)
#testing_y - target variable(testing)
#cf - ["coefficients","features"](coefficients for logistic
#regression,features for tree based models)
#threshold_plot - if True returns threshold plot for model
def data_attrition_prediction(algorithm,training_x,testing_x,
                              training_y,testing_y,cols,cf,threshold_plot) :
    """Fit *algorithm*, print a text report and plot the confusion matrix,
    ROC curve and coefficient/feature-importance bars in one plotly figure.

    algorithm      -- sklearn-style classifier exposing predict_proba
    training_x/y   -- training predictors / labels
    testing_x/y    -- held-out predictors / labels
    cols           -- predictor column names (for the importance bar chart)
    cf             -- "coefficients" (linear models) or "features" (tree models)
    threshold_plot -- when True, also show yellowbrick's discrimination-threshold plot
    """
    #model
    algorithm.fit(training_x,training_y)
    predictions = algorithm.predict(testing_x)
    probabilities = algorithm.predict_proba(testing_x)
    #coeffs
    # NOTE(review): any other cf value leaves `coefficients` undefined and
    # raises NameError at the merge below.
    if cf == "coefficients" :
        coefficients = pd.DataFrame(algorithm.coef_.ravel())
    elif cf == "features" :
        coefficients = pd.DataFrame(algorithm.feature_importances_)
    # Pair each coefficient/importance with its column name by position.
    column_df = pd.DataFrame(cols)
    coef_sumry = (pd.merge(coefficients,column_df,left_index= True,
                           right_index= True, how = "left"))
    coef_sumry.columns = ["coefficients","features"]
    coef_sumry = coef_sumry.sort_values(by = "coefficients",ascending = False)
    print (algorithm)
    print ("\n Classification report : \n",classification_report(testing_y,predictions))
    print ("Accuracy Score : ",accuracy_score(testing_y,predictions))
    #confusion matrix
    conf_matrix = confusion_matrix(testing_y,predictions)
    #roc_auc_score
    # NOTE(review): AUC here is computed from hard 0/1 predictions, not the
    # probabilities used for the ROC curve below -- the two can disagree.
    model_roc_auc = roc_auc_score(testing_y,predictions)
    print ("Area under curve : ",model_roc_auc,"\n")
    fpr,tpr,thresholds = roc_curve(testing_y,probabilities[:,1])
    # Confusion-matrix heatmap (subplot 1,1).
    trace1 = go.Heatmap(z = conf_matrix ,
                        x = ["Not Attrition","Attrition"],
                        y = ["Not Attrition","Attrition"],
                        showscale = False,colorscale = "Picnic",
                        name = "matrix")
    #plot roc curve
    trace2 = go.Scatter(x = fpr,y = tpr,
                        name = "Roc : " + str(model_roc_auc),
                        line = dict(color = ('rgb(22, 96, 167)'),width = 2))
    # Diagonal chance line for the ROC panel.
    trace3 = go.Scatter(x = [0,1],y=[0,1],
                        line = dict(color = ('rgb(205, 12, 24)'),width = 2,
                                    dash = 'dot'))
    #plot coeffs
    trace4 = go.Bar(x = coef_sumry["features"],y = coef_sumry["coefficients"],
                    name = "coefficients",
                    marker = dict(color = coef_sumry["coefficients"],
                                  colorscale = "Picnic",
                                  line = dict(width = .6,color = "black")))
    #subplots
    fig = plotly.subplots.make_subplots(rows=2, cols=2, specs=[[{}, {}], [{'colspan': 2}, None]],
                                        subplot_titles=('Confusion Matrix',
                                                        'Receiver operating characteristic',
                                                        'Feature Importances'))
    fig.append_trace(trace1,1,1)
    fig.append_trace(trace2,1,2)
    fig.append_trace(trace3,1,2)
    fig.append_trace(trace4,2,1)
    fig['layout'].update(showlegend=False, title="Model performance" ,
                         autosize = False,height = 900,width = 800,
                         plot_bgcolor = 'rgba(240,240,240, 0.95)',
                         paper_bgcolor = 'rgba(240,240,240, 0.95)',
                         margin = dict(b = 195))
    fig["layout"]["xaxis2"].update(dict(title = "false positive rate"))
    fig["layout"]["yaxis2"].update(dict(title = "true positive rate"))
    fig["layout"]["xaxis3"].update(dict(showgrid = True,tickfont = dict(size = 10),
                                        tickangle = 90))
    py.iplot(fig)
    if threshold_plot == True :
        visualizer = DiscriminationThreshold(algorithm)
        visualizer.fit(training_x,training_y)
        visualizer.poof()
# Baseline: plain logistic regression on the (imbalanced) training split.
# NOTE(review): multi_class='ovr' is deprecated in recent scikit-learn --
# confirm against the pinned version.
logit = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
                           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
                           verbose=0, warm_start=False)
data_attrition_prediction(logit,train_X,test_X,train_Y,test_Y,
                          cols,"coefficients",threshold_plot = True)
from imblearn.over_sampling import SMOTE
cols = [i for i in data.columns if i not in Id_col+target_col]
smote_X = data[cols]
smote_Y = data[target_col]
#Split train and test data
smote_train_X,smote_test_X,smote_train_Y,smote_test_Y = train_test_split(smote_X,smote_Y,
                                                                         test_size = .25 ,
                                                                         random_state = 111)
#oversampling minority class using smote
# FIX: the sampler was previously bound to the name `os`, shadowing the
# `os` module imported at the top of the file, and it called fit_sample(),
# which was deprecated and then removed from imbalanced-learn;
# fit_resample() is the supported API.
oversampler = SMOTE(random_state = 0)
os_smote_X,os_smote_Y = oversampler.fit_resample(smote_train_X,smote_train_Y)
os_smote_X = pd.DataFrame(data = os_smote_X,columns=cols)
os_smote_Y = pd.DataFrame(data = os_smote_Y,columns=target_col)
###
# Logistic regression retrained on the SMOTE-balanced training set,
# evaluated on the original (untouched) test split.
logit_smote = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                                 intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
                                 penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
                                 verbose=0, warm_start=False)
data_attrition_prediction(logit_smote,os_smote_X,test_X,os_smote_Y,test_Y,
                          cols,"coefficients",threshold_plot = True)
from sklearn.feature_selection import RFE
logit = LogisticRegression()
# Recursive feature elimination down to the 10 strongest predictors.
# FIX: n_features_to_select must be passed by keyword -- the positional
# form RFE(logit,10) stopped working in scikit-learn 1.2.
rfe = RFE(estimator = logit, n_features_to_select = 10)
# ravel() flattens the one-column target frame to the 1-D array sklearn expects.
rfe = rfe.fit(os_smote_X,os_smote_Y.values.ravel())
rfe.support_
rfe.ranking_
#identified columns Recursive Feature Elimination
idc_rfe = pd.DataFrame({"rfe_support" :rfe.support_,
                        "columns" : [i for i in data.columns if i not in Id_col + target_col],
                        "ranking" : rfe.ranking_,
                       })
cols = idc_rfe[idc_rfe["rfe_support"] == True]["columns"].tolist()
#separating train and test data
train_rf_X = os_smote_X[cols]
train_rf_Y = os_smote_Y
test_rf_X = test[cols]
test_rf_Y = test[target_col]
logit_rfe = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                               intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
                               penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
                               verbose=0, warm_start=False)
#applying model
data_attrition_prediction(logit_rfe,train_rf_X,test_rf_X,train_rf_Y,test_rf_Y,
                          cols,"coefficients",threshold_plot = True)
# Render the per-column support/ranking table.
tab_rk = ff.create_table(idc_rfe)
py.iplot(tab_rk)
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
#select columns
cols = [i for i in data.columns if i not in Id_col + target_col ]
#dataframe with non negative values -- chi2 requires non-negative
#features, hence the unscaled df_data_og copy rather than `data`.
df_x = df_data_og[cols]
df_y = df_data_og[target_col]
#fit model with k= 3
select = SelectKBest(score_func = chi2,k = 3)
fit = select.fit(df_x,df_y)
#Summarize scores
print ("scores")
print (fit.scores_)
print ("P - Values")
print (fit.pvalues_)
#create dataframe of chi2 score and p-value per feature, best first
score = pd.DataFrame({"features":cols,"scores":fit.scores_,"p_values":fit.pvalues_ })
score = score.sort_values(by = "scores" ,ascending =False)
#creating new label for categorical and numerical columns
score["feature_type"] = np.where(score["features"].isin(num_cols),"Numerical","Categorical")
#plot
# Chi2 scores: line+markers for categorical features (left axis region),
# bars for numerical features (right region, on the secondary x2/y2 axes).
# NOTE(review): the legend name "Categorial" is a typo in the displayed label.
trace = go.Scatter(x = score[score["feature_type"] == "Categorical"]["features"],
                   y = score[score["feature_type"] == "Categorical"]["scores"],
                   name = "Categorial",mode = "lines+markers",
                   marker = dict(color = "red",
                                 line = dict(width =1))
                  )
trace1 = go.Bar(x = score[score["feature_type"] == "Numerical"]["features"],
                y = score[score["feature_type"] == "Numerical"]["scores"],name = "Numerical",
                marker = dict(color = "royalblue",
                              line = dict(width =1)),
                xaxis = "x2",yaxis = "y2"
               )
layout = go.Layout(dict(title = "Scores for Categorical & Numerical features",
                        plot_bgcolor = "rgb(243,243,243)",
                        paper_bgcolor = "rgb(243,243,243)",
                        xaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                     tickfont = dict(size =10),
                                     domain=[0, 0.7],
                                     tickangle = 90,zerolinewidth=1,
                                     ticklen=5,gridwidth=2),
                        yaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                     title = "scores",
                                     zerolinewidth=1,ticklen=5,gridwidth=2),
                        margin = dict(b=200),
                        xaxis2=dict(domain=[0.8, 1],tickangle = 90,
                                    gridcolor = 'rgb(255, 255, 255)'),
                        yaxis2=dict(anchor='x2',gridcolor = 'rgb(255, 255, 255)')
                       )
                  )
plt_data=[trace,trace1]
fig = go.Figure(data=plt_data,layout=layout)
py.iplot(fig)
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn import tree
from graphviz import Source
from IPython.display import SVG,display
#top 3 categorical features by chi2 score (`score` is already sorted descending)
features_cat = score[score["feature_type"] == "Categorical"]["features"][:3].tolist()
#top 3 numerical features
features_num = score[score["feature_type"] == "Numerical"]["features"][:3].tolist()
#Function attributes
#columns - selected columns
#maximum_depth - depth of tree
#criterion_type - ["gini" or "entropy"]
#split_type - ["best" or "random"]
#Model Performance - True (gives model output)
def plot_decision_tree(columns,maximum_depth,criterion_type,
                       split_type,model_performance = None) :
    """Fit a decision tree on *columns* and render it with graphviz.

    columns           -- predictor columns (taken from the chi2 frames df_x/df_y)
    maximum_depth     -- max_depth for the tree
    criterion_type    -- "gini" or "entropy"
    split_type        -- "best" or "random"
    model_performance -- when True, also run the full evaluation report
    """
    # Dependent and independent variables for this subset of columns.
    features = df_x[columns]
    labels = df_y[target_col]
    classifier = DecisionTreeClassifier(max_depth = maximum_depth,
                                        splitter = split_type,
                                        criterion = criterion_type,
                                       )
    classifier.fit(features, labels)
    # Export the fitted tree to DOT and wrap it for notebook display.
    dot_source = tree.export_graphviz(classifier, out_file = None,
                                      rounded = True, proportion = False,
                                      feature_names = columns,
                                      precision = 2,
                                      class_names = ["Not Attrition","Attrition"],
                                      filled = True
                                     )
    graph = Source(dot_source)
    #model performance
    if model_performance == True :
        data_attrition_prediction(classifier,
                                  features, test_X[columns],
                                  labels, test_Y,
                                  columns, "features", threshold_plot = True)
    display(graph)
# Tree on the top numerical features (visualisation only), then on the
# top categorical features with the full performance report.
plot_decision_tree(features_num,3,"gini","best")
plot_decision_tree(features_cat,3,"entropy","best",
                   model_performance = True ,)
def data_Attrition_prediction_alg(algorithm,training_x,testing_x,
                                  training_y,testing_y,threshold_plot = True) :
    """Fit *algorithm* and report its performance (variant without the
    coefficient/importance panel).

    Prints the classification report, accuracy and AUC, then plots the ROC
    curve alongside the confusion matrix in one plotly figure; optionally
    shows yellowbrick's discrimination-threshold plot.
    """
    #model
    algorithm.fit(training_x,training_y)
    predictions = algorithm.predict(testing_x)
    probabilities = algorithm.predict_proba(testing_x)
    print (algorithm)
    print ("\n Classification report : \n",classification_report(testing_y,predictions))
    print ("Accuracy Score : ",accuracy_score(testing_y,predictions))
    #confusion matrix
    conf_matrix = confusion_matrix(testing_y,predictions)
    #roc_auc_score
    model_roc_auc = roc_auc_score(testing_y,predictions)
    print ("Area under curve : ",model_roc_auc)
    fpr,tpr,thresholds = roc_curve(testing_y,probabilities[:,1])
    #plot roc curve
    trace1 = go.Scatter(x = fpr,y = tpr,
                        name = "Roc : " + str(model_roc_auc),
                        line = dict(color = ('rgb(22, 96, 167)'),width = 2),
                       )
    trace2 = go.Scatter(x = [0,1],y=[0,1],
                        line = dict(color = ('rgb(205, 12, 24)'),width = 2,
                                    dash = 'dot'))
    #plot confusion matrix
    # FIX: axis labels previously said "Not churn"/"Churn" -- a copy-paste
    # from a churn notebook; this dataset models employee attrition.
    trace3 = go.Heatmap(z = conf_matrix ,x = ["Not Attrition","Attrition"],
                        y = ["Not Attrition","Attrition"],
                        showscale = False,colorscale = "Blues",name = "matrix",
                        xaxis = "x2",yaxis = "y2"
                       )
    layout = go.Layout(dict(title="Model performance" ,
                            autosize = False,height = 500,width = 800,
                            showlegend = False,
                            plot_bgcolor = "rgb(243,243,243)",
                            paper_bgcolor = "rgb(243,243,243)",
                            xaxis = dict(title = "false positive rate",
                                         gridcolor = 'rgb(255, 255, 255)',
                                         domain=[0, 0.6],
                                         ticklen=5,gridwidth=2),
                            yaxis = dict(title = "true positive rate",
                                         gridcolor = 'rgb(255, 255, 255)',
                                         zerolinewidth=1,
                                         ticklen=5,gridwidth=2),
                            margin = dict(b=200),
                            xaxis2=dict(domain=[0.7, 1],tickangle = 90,
                                        gridcolor = 'rgb(255, 255, 255)'),
                            yaxis2=dict(anchor='x2',gridcolor = 'rgb(255, 255, 255)')
                           )
                      )
    plt_data = [trace1,trace2,trace3]
    fig = go.Figure(data=plt_data,layout=layout)
    py.iplot(fig)
    if threshold_plot == True :
        visualizer = DiscriminationThreshold(algorithm)
        visualizer.fit(training_x,training_y)
        visualizer.poof()
from sklearn.neighbors import KNeighborsClassifier
# K-nearest-neighbours on the SMOTE-balanced training set.
knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
                           weights='uniform')
data_Attrition_prediction_alg(knn,os_smote_X,test_X,
                              os_smote_Y,test_Y,threshold_plot = True)
from sklearn.ensemble import RandomForestClassifier
#function attributes
#columns - column used
#nf_estimators - The number of trees in the forest.
#estimated_tree - tree number to be displayed
#maximum_depth - depth of the tree
#criterion_type - split criterion type ["gini" or "entropy"]
#Model performance - prints performance of model
def plot_tree_randomforest(columns,nf_estimators,
                           estimated_tree,maximum_depth,
                           criterion_type,model_performance = None) :
    """Fit a random forest on *columns* and render one of its trees.

    columns           -- predictor column names (the unscaled df_data_og is used)
    nf_estimators     -- number of trees in the forest
    estimated_tree    -- index of the tree to display
    maximum_depth     -- max_depth for every tree
    criterion_type    -- "gini" or "entropy"
    model_performance -- when True, also run the full evaluation report
    """
    dataframe = df_data_og[columns + target_col].copy()
    #train and test datasets
    rf_x = dataframe[[i for i in columns if i not in target_col]]
    rf_y = dataframe[target_col]
    #random forest classifier
    rfc = RandomForestClassifier(n_estimators = nf_estimators,
                                 max_depth = maximum_depth,
                                 criterion = criterion_type,
                                )
    rfc.fit(rf_x,rf_y)
    # FIX: don't rebind the `estimated_tree` index parameter to the tree
    # object -- use a distinct local name.
    selected_tree = rfc.estimators_[estimated_tree]
    # FIX: class names previously said "Not churn"/"Churn" -- a copy-paste
    # from a churn notebook; this dataset models employee attrition.
    graph = Source(tree.export_graphviz(selected_tree,out_file=None,
                                        rounded=True,proportion = False,
                                        feature_names = columns,
                                        precision = 2,
                                        class_names=["Not Attrition","Attrition"],
                                        filled = True))
    display(graph)
    #model performance
    if model_performance == True :
        data_attrition_prediction(rfc,
                                  rf_x,test_X[columns],
                                  rf_y,test_Y,
                                  columns,"features",threshold_plot = True)
# Forest over all predictors; display tree #99 plus the performance report.
cols1 = [ i for i in train_X.columns if i not in target_col + Id_col]
plot_tree_randomforest(cols1,100,99,3,"entropy",True)
# Display each of the ten trees of a small entropy forest ...
n = np.arange(0,10).tolist()
cols1 = [ i for i in train_X.columns if i not in target_col + Id_col]
for i in n :
    plot_tree_randomforest(cols1,10,i,3,"entropy",model_performance=False)
# ... and of a gini forest restricted to the RFE-selected columns.
n = np.arange(0,10).tolist()
cols = idc_rfe[idc_rfe["rfe_support"] == True]["columns"].tolist()
for i in n :
    plot_tree_randomforest(cols,10,i,3,"gini",model_performance=False)
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes on the SMOTE-balanced training set.
gnb = GaussianNB(priors=None)
data_Attrition_prediction_alg(gnb,os_smote_X,test_X,os_smote_Y,test_Y)
from sklearn.svm import SVC
#Support vector classifier
#using linear hyper plane
svc_lin = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
              decision_function_shape='ovr', degree=3, gamma=1.0, kernel='linear',
              max_iter=-1, probability=True, random_state=None, shrinking=True,
              tol=0.001, verbose=False)
cols = [i for i in data.columns if i not in Id_col + target_col]
data_attrition_prediction(svc_lin,os_smote_X,test_X,os_smote_Y,test_Y,
                          cols,"coefficients",threshold_plot = False)
#tuning parameters
#Support vector classifier
#using non-linear hyper plane("rbf")
svc_rbf = SVC(C=1.0, kernel='rbf',
              degree= 3, gamma=1.0,
              coef0=0.0, shrinking=True,
              probability=True,tol=0.001,
              cache_size=200, class_weight=None,
              verbose=False,max_iter= -1,
              random_state=None)
data_Attrition_prediction_alg(svc_rbf,os_smote_X,test_X,os_smote_Y,test_Y,threshold_plot = False)
from lightgbm import LGBMClassifier
# Gradient-boosted trees (LightGBM).
lgbm_c = LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
                        learning_rate=0.5, max_depth=7, min_child_samples=20,
                        min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
                        n_jobs=-1, num_leaves=500, objective='binary', random_state=None,
                        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
                        subsample_for_bin=200000, subsample_freq=0)
cols = [i for i in data.columns if i not in Id_col + target_col]
data_attrition_prediction(lgbm_c,os_smote_X,test_X,os_smote_Y,test_Y,
                          cols,"features",threshold_plot = True)
from xgboost import XGBClassifier
# Gradient-boosted trees (XGBoost).
# NOTE(review): missing=None is rejected by newer xgboost releases (they
# expect a float such as np.nan) -- confirm against the pinned version.
xgc = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                    colsample_bytree=1, gamma=0, learning_rate=0.9, max_delta_step=0,
                    max_depth = 7, min_child_weight=1, missing=None, n_estimators=100,
                    n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
                    reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
                    silent=True, subsample=1)
data_attrition_prediction(xgc,os_smote_X,test_X,os_smote_Y,test_Y,
                          cols,"features",threshold_plot = True)
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
#gives model report in dataframe
def model_report(model,training_x,testing_x,training_y,testing_y,name) :
model.fit(training_x,training_y)
predictions = model.predict(testing_x)
accuracy = accuracy_score(testing_y,predictions)
recallscore = recall_score(testing_y,predictions)
precision = precision_score(testing_y,predictions)
roc_auc = roc_auc_score(testing_y,predictions)
f1score = f1_score(testing_y,predictions)
kappa_metric = cohen_kappa_score(testing_y,predictions)
df = pd.DataFrame({"Model" : [name],
"Accuracy_score" : [accuracy],
"Recall_score" : [recallscore],
"Precision" : [precision],
"f1_score" : [f1score],
"Area_under_curve": [roc_auc],
"Kappa_metric" : [kappa_metric],
})
return df
#outputs for every model
# One summary row per trained model; each model is evaluated on the test
# split matching the data it was trained on.
model1 = model_report(logit,train_X,test_X,train_Y,test_Y,
                      "Logistic Regression(Baseline_model)")
model2 = model_report(logit_smote,os_smote_X,test_X,os_smote_Y,test_Y,
                      "Logistic Regression(SMOTE)")
model3 = model_report(logit_rfe,train_rf_X,test_rf_X,train_rf_Y,test_rf_Y,
                      "Logistic Regression(RFE)")
decision_tree = DecisionTreeClassifier(max_depth = 9,
                                       random_state = 123,
                                       splitter = "best",
                                       criterion = "gini",
                                      )
model4 = model_report(decision_tree,train_X,test_X,train_Y,test_Y,
                      "Decision Tree")
model5 = model_report(knn,os_smote_X,test_X,os_smote_Y,test_Y,
                      "KNN Classifier")
rfc = RandomForestClassifier(n_estimators = 1000,
                             random_state = 123,
                             max_depth = 9,
                             criterion = "gini")
model6 = model_report(rfc,train_X,test_X,train_Y,test_Y,
                      "Random Forest Classifier")
model7 = model_report(gnb,os_smote_X,test_X,os_smote_Y,test_Y,
                      "Naive Bayes")
model8 = model_report(svc_lin,os_smote_X,test_X,os_smote_Y,test_Y,
                      "SVM Classifier Linear")
model9 = model_report(svc_rbf,os_smote_X,test_X,os_smote_Y,test_Y,
                      "SVM Classifier RBF")
model10 = model_report(lgbm_c,os_smote_X,test_X,os_smote_Y,test_Y,
                       "LGBM Classifier")
model11 = model_report(xgc,os_smote_X,test_X,os_smote_Y,test_Y,
                       "XGBoost Classifier")
#concat all models into a single comparison frame
model_performances = pd.concat([model1,model2,model3,
                                model4,model5,model6,
                                model7,model8,model9,
                                model10,model11],axis = 0).reset_index()
model_performances = model_performances.drop(columns = "index",axis =1)
table = ff.create_table(np.round(model_performances,4))
py.iplot(table)
model_performances
def output_tracer(metric,color) :
    """Horizontal bar trace of one *metric* column from the module-level
    model_performances frame, coloured with *color*."""
    bar = go.Bar(x = model_performances[metric],
                 y = model_performances["Model"],
                 orientation = "h",
                 name = metric,
                 marker = dict(color = color,
                               line = dict(width =.7))
                )
    return bar
# Grouped horizontal bars comparing every model on each metric.
layout = go.Layout(dict(title = "Model performances",
                        plot_bgcolor = "rgb(243,243,243)",
                        paper_bgcolor = "rgb(243,243,243)",
                        xaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                     title = "metric",
                                     zerolinewidth=1,
                                     ticklen=5,gridwidth=2),
                        yaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                     zerolinewidth=1,ticklen=5,gridwidth=2),
                        margin = dict(l = 250),
                        height = 780
                       )
                  )
trace1 = output_tracer("Accuracy_score","#6699FF")
trace2 = output_tracer('Recall_score',"red")
trace3 = output_tracer('Precision',"#33CC99")
trace4 = output_tracer('f1_score',"lightgrey")
trace5 = output_tracer('Kappa_metric',"#FFCC99")
# NOTE(review): this rebinds the module-level name `data` (previously the
# DataFrame) to a list of traces -- later cells no longer see the frame
# under that name.
data = [trace1,trace2,trace3,trace4,trace5]
fig = go.Figure(data=data,layout=layout)
py.iplot(fig)
# Confusion matrix per model on the shared test split, in a 4x3 grid.
lst = [logit,logit_smote,decision_tree,knn,rfc,
       gnb,svc_lin,svc_rbf,lgbm_c,xgc]
length = len(lst)
mods = ['Logistic Regression(Baseline_model)','Logistic Regression(SMOTE)',
        'Decision Tree','KNN Classifier','Random Forest Classifier',"Naive Bayes",
        'SVM Classifier Linear','SVM Classifier RBF', 'LGBM Classifier',
        'XGBoost Classifier']
fig = plt.figure(figsize=(13,15))
fig.set_facecolor("#F3F3F3")
for i,j,k in itertools.zip_longest(lst,range(length),mods) :
    plt.subplot(4,3,j+1)
    predictions = i.predict(test_X)
    # FIX: confusion_matrix expects (y_true, y_pred); the arguments were
    # swapped, transposing every matrix relative to its tick labels.
    conf_matrix = confusion_matrix(test_Y,predictions)
    # FIX: tick labels said "churn" (copy-paste from a churn notebook);
    # this dataset models employee attrition.
    sns.heatmap(conf_matrix,annot=True,fmt = "d",square = True,
                xticklabels=["not attrition","attrition"],
                yticklabels=["not attrition","attrition"],
                linewidths = 2,linecolor = "w",cmap = "Set1")
    plt.title(k,color = "b")
    plt.subplots_adjust(wspace = .3,hspace = .3)
# ROC curve per model on the shared test split, in a 4x3 grid.
lst = [logit,logit_smote,decision_tree,knn,rfc,
       gnb,svc_lin,svc_rbf,lgbm_c,xgc]
length = len(lst)
mods = ['Logistic Regression(Baseline_model)','Logistic Regression(SMOTE)',
        'Decision Tree','KNN Classifier','Random Forest Classifier',"Naive Bayes",
        'SVM Classifier Linear','SVM Classifier RBF', 'LGBM Classifier',
        'XGBoost Classifier']
plt.style.use("dark_background")
fig = plt.figure(figsize=(12,16))
fig.set_facecolor("#F3F3F3")
for i,j,k in itertools.zip_longest(lst,range(length),mods) :
    qx = plt.subplot(4,3,j+1)
    probabilities = i.predict_proba(test_X)
    fpr,tpr,thresholds = roc_curve(test_Y,probabilities[:,1])
    # FIX: the AUC shown in the legend was computed from hard predict()
    # labels while the curve is drawn from probabilities; score the same
    # probabilities so the number matches the plotted curve.
    plt.plot(fpr,tpr,linestyle = "dotted",
             color = "royalblue",linewidth = 2,
             label = "AUC = " + str(np.around(roc_auc_score(test_Y,probabilities[:,1]),3)))
    plt.plot([0,1],[0,1],linestyle = "dashed",
             color = "orangered",linewidth = 1.5)
    plt.fill_between(fpr,tpr,alpha = .4)
    plt.fill_between([0,1],[0,1],color = "k")
    plt.legend(loc = "lower right",
               prop = {"size" : 12})
    qx.set_facecolor("k")
    plt.grid(True,alpha = .15)
    plt.title(k,color = "b")
    plt.xticks(np.arange(0,1,.3))
    plt.yticks(np.arange(0,1,.3))
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
# Precision-recall curve per model on the shared test split, 4x3 grid.
lst = [logit,logit_smote,decision_tree,knn,rfc,
       gnb,svc_lin,svc_rbf,lgbm_c,xgc]
length = len(lst)
mods = ['Logistic Regression(Baseline_model)','Logistic Regression(SMOTE)',
        'Decision Tree','KNN Classifier','Random Forest Classifier',"Naive Bayes",
        'SVM Classifier Linear','SVM Classifier RBF', 'LGBM Classifier',
        'XGBoost Classifier']
fig = plt.figure(figsize=(13,17))
fig.set_facecolor("#F3F3F3")
for i,j,k in itertools.zip_longest(lst,range(length),mods) :
    qx = plt.subplot(4,3,j+1)
    probabilities = i.predict_proba(test_X)
    # FIX: precision_recall_curve returns (precision, recall, thresholds);
    # the first two were unpacked in swapped order, so the plot's axes were
    # transposed relative to their "recall"/"precision" labels.
    precision,recall,thresholds = precision_recall_curve(test_Y,probabilities[:,1])
    # FIX: score average precision from the same probability scores the
    # curve is drawn from (predict() gives hard labels).
    plt.plot(recall,precision,linewidth = 1.5,
             label = ("avg_pcn : " +
                      str(np.around(average_precision_score(test_Y,probabilities[:,1]),3))))
    plt.plot([0,1],[0,0],linestyle = "dashed")
    plt.fill_between(recall,precision,alpha = .2)
    plt.legend(loc = "lower left",
               prop = {"size" : 10})
    qx.set_facecolor("k")
    plt.grid(True,alpha = .15)
    plt.title(k,color = "b")
    plt.xlabel("recall",fontsize =7)
    plt.ylabel("precision",fontsize =7)
    plt.xlim([0.25,1])
    plt.yticks(np.arange(0,1,.3))